{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a577777f-a994-493b-bf39-4d19f4fdc5f8",
   "metadata": {},
   "source": [
    "# Install vLLM\n",
    "`pip install vllm` <br>\n",
    "or, to build from source, follow the instructions at <br>\n",
    "https://docs.vllm.ai/en/latest/getting_started/installation.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00d46316-8399-4731-8e97-5d8c8c436d99",
   "metadata": {},
   "source": [
    "# Orca-2-7b Completion Example\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f92b41e-cec4-4769-af4d-3c9591d3d3a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ[\"HF_HOME\"] = \"model/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "319f3365-20fa-401a-af8e-882031739816",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llms.vllm import Vllm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ebf60fa2-10bc-4bbe-8b9d-f1d94fac9a4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-12-03 00:40:26,298\tINFO worker.py:1642 -- Started a local Ray instance.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 12-03 00:40:27 llm_engine.py:72] Initializing an LLM engine with config: model='microsoft/Orca-2-7b', tokenizer='microsoft/Orca-2-7b', tokenizer_mode=auto, revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=4, quantization=None, seed=0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:40:36,195 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438410240; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:40:46,207 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438385664; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:40:56,219 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438385664; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[36m(RayWorker pid=3872755)\u001b[0m Blocksparse is not available: the current GPU does not expose Tensor cores\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:41:06,230 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438377472; capacity: 232947798016. Object creation will fail if spilling is required.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 12-03 00:41:07 llm_engine.py:205] # GPU blocks: 898, # CPU blocks: 512\n"
     ]
    }
   ],
   "source": [
    "# Build an in-process vLLM instance for Orca-2-7b.\n",
    "llm = Vllm(\n",
    "    model=\"microsoft/Orca-2-7b\",\n",
    "    max_new_tokens=100,\n",
    "    tensor_parallel_size=4,\n",
    "    vllm_kwargs=dict(swap_space=1, gpu_memory_utilization=0.5),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ba42670-75b5-4080-8257-7cbb39085728",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.36s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[CompletionResponse(text=\"\\n Especially for a beginner, a black hole is a place where gravity is so strong that it pulls everything, even light, towards itself. Imagine a well with a bottomless pit, but in space. Instead of being able to see the bottom, the gravity of the black hole is so strong that it makes the bottom disappear. It's like a point where the laws of physics, as we know them, break down.\", additional_kwargs={}, raw=None, delta=None)]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:42:06,287 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438160384; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:42:16,297 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438156288; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:42:26,307 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438119424; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:42:36,316 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438111232; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:42:46,325 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438098944; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:42:56,334 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438098944; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:43:06,343 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438094848; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:43:16,352 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438082560; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 00:43:26,361 E 3871478 3871491] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_00-40-24_132284_3871321 is over 95% full, available space: 2438029312; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "*** SIGTERM received at time=1701544410 on cpu 27 ***\n",
      "PC: @     0x7faffa6bb46e  (unknown)  epoll_wait\n",
      "    @     0x7faffa906420  (unknown)  (unknown)\n",
      "[2023-12-03 00:43:30,340 E 3871321 3871321] logging.cc:361: *** SIGTERM received at time=1701544410 on cpu 27 ***\n",
      "[2023-12-03 00:43:30,340 E 3871321 3871321] logging.cc:361: PC: @     0x7faffa6bb46e  (unknown)  epoll_wait\n",
      "[2023-12-03 00:43:30,340 E 3871321 3871321] logging.cc:361:     @     0x7faffa906420  (unknown)  (unknown)\n"
     ]
    }
   ],
   "source": [
    "llm.complete(\n",
    "    [\"[INST]You are a helpful assistant[/INST] What is a black hole ?\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f67b22d6-37c0-4b42-b9c0-d259ee41fced",
   "metadata": {},
   "source": [
    "# CodeLlama-7b Completion Example\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc529fdc-9801-4ae1-9a1f-7242b0224645",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING 12-03 01:02:02 config.py:341] Casting torch.bfloat16 to torch.float16.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-12-03 01:02:04,400\tINFO worker.py:1642 -- Started a local Ray instance.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 12-03 01:02:05 llm_engine.py:72] Initializing an LLM engine with config: model='codellama/CodeLlama-7b-hf', tokenizer='codellama/CodeLlama-7b-hf', tokenizer_mode=auto, revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=4, quantization=None, seed=0)\n",
      "INFO 12-03 01:02:05 tokenizer.py:30] For some LLaMA V1 models, initializing the fast tokenizer may take a long time. To reduce the initialization time, consider using 'hf-internal-testing/llama-tokenizer' instead of the original tokenizer.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:02:14,296 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434420736; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[36m(RayWorker pid=3882520)\u001b[0m Blocksparse is not available: the current GPU does not expose Tensor cores\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 12-03 01:02:22 llm_engine.py:205] # GPU blocks: 850, # CPU blocks: 512\n"
     ]
    }
   ],
   "source": [
    "# Build an in-process vLLM instance for CodeLlama-7b.\n",
    "llm = Vllm(\n",
    "    model=\"codellama/CodeLlama-7b-hf\",\n",
    "    tensor_parallel_size=4,\n",
    "    dtype=\"float16\",\n",
    "    max_new_tokens=100,\n",
    "    temperature=0,\n",
    "    vllm_kwargs=dict(\n",
    "        swap_space=1,\n",
    "        gpu_memory_utilization=0.5,\n",
    "        max_model_len=4096,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "558a8f57-427f-4fd2-a264-5a268e5667fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts:   0%|                                                                                                                                                              | 0/1 [00:00<?, ?it/s]\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:02:24,306 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434400256; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.64s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[CompletionResponse(text='\\n    \"\"\"\\n    Ping a host with exponential backoff.\\n    \"\"\"\\n    # Setup the socket\\n    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\\n    s.settimeout(1)\\n\\n    # Setup the exponential backoff\\n    backoff = 1\\n    max_backoff = 10\\n\\n    # Ping the host\\n    while True:\\n        try', additional_kwargs={}, raw=None, delta=None)]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:02:34,315 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434396160; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:02:44,324 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434392064; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:02:54,333 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434383872; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:03:04,342 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434355200; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:03:14,352 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434199552; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:03:24,361 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434174976; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:03:34,371 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434166784; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:03:44,380 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434162688; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:03:54,389 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434158592; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:04:04,398 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434117632; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:04:14,407 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434142208; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:04:24,416 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434134016; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:04:34,425 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434125824; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:04:44,434 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434121728; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:04:54,443 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434109440; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:05:04,453 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2434072576; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:05:14,461 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2433916928; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:05:24,471 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2433912832; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:05:34,480 E 3881242 3881255] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-02-02_227423_3880966 is over 95% full, available space: 2433904640; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "*** SIGTERM received at time=1701545737 on cpu 19 ***\n",
      "PC: @     0x7fd7e3e3246e  (unknown)  epoll_wait\n",
      "    @     0x7fd7e407d420  (unknown)  (unknown)\n",
      "[2023-12-03 01:05:37,122 E 3880966 3880966] logging.cc:361: *** SIGTERM received at time=1701545737 on cpu 19 ***\n",
      "[2023-12-03 01:05:37,122 E 3880966 3880966] logging.cc:361: PC: @     0x7fd7e3e3246e  (unknown)  epoll_wait\n",
      "[2023-12-03 01:05:37,122 E 3880966 3880966] logging.cc:361:     @     0x7fd7e407d420  (unknown)  (unknown)\n"
     ]
    }
   ],
   "source": [
    "llm.complete([\"import socket\\n\\ndef ping_exponential_backoff(host: str):\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f3b7693-a0b2-46a0-ade8-1abb36691a49",
   "metadata": {},
   "source": [
    "# Mistral-7B-Instruct Completion Example\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ce09b14-b208-48e3-b779-b1f2605e901a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING 12-03 01:06:44 config.py:341] Casting torch.bfloat16 to torch.float16.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-12-03 01:06:47,045\tINFO worker.py:1642 -- Started a local Ray instance.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 12-03 01:06:48 llm_engine.py:72] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.1', tokenizer='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_mode=auto, revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=4, quantization=None, seed=0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:06:56,943 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433503232; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:07:06,955 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433474560; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[36m(RayWorker pid=3885160)\u001b[0m Blocksparse is not available: the current GPU does not expose Tensor cores\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:07:16,966 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433470464; capacity: 232947798016. Object creation will fail if spilling is required.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 12-03 01:07:19 llm_engine.py:205] # GPU blocks: 2764, # CPU blocks: 2048\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:07:26,975 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433462272; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:07:36,984 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433458176; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:07:46,994 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433429504; capacity: 232947798016. Object creation will fail if spilling is required.\n"
     ]
    }
   ],
   "source": [
    "# Build an in-process vLLM instance for Mistral-7B-Instruct.\n",
    "llm = Vllm(\n",
    "    model=\"mistralai/Mistral-7B-Instruct-v0.1\",\n",
    "    tensor_parallel_size=4,\n",
    "    dtype=\"float16\",\n",
    "    max_new_tokens=100,\n",
    "    temperature=0,\n",
    "    vllm_kwargs=dict(\n",
    "        swap_space=1,\n",
    "        gpu_memory_utilization=0.5,\n",
    "        max_model_len=4096,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f08d009-594c-4f21-a705-078493b74fa4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processed prompts:   0%|                                                                                                                                                              | 0/1 [00:00<?, ?it/s]\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:08:07,011 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433265664; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.30s/it]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[CompletionResponse(text='\\nA black hole is a region of space from which nothing, not even light, can escape. It is believed to be the result of the gravitational collapse of the remnants of an extremely massive star. The gravitational force within a black hole is incredibly strong and is due to the fact that a massive amount of matter has been squeezed into an incredibly small space. Because of this, black holes are extremely difficult to observe directly, as nothing can escape from them. Instead, scientists study the', additional_kwargs={}, raw=None, delta=None)]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:08:17,020 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433232896; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:08:27,029 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433200128; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:08:37,038 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433200128; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:08:47,048 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433163264; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:08:57,058 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433150976; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:09:07,067 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433146880; capacity: 232947798016. Object creation will fail if spilling is required.\n",
      "\u001b[2m\u001b[33m(raylet)\u001b[0m [2023-12-03 01:09:17,076 E 3883882 3883896] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-12-03_01-06-44_873224_3883710 is over 95% full, available space: 2433138688; capacity: 232947798016. Object creation will fail if spilling is required.\n"
     ]
    }
   ],
   "source": [
    "llm.complete([\" What is a black hole ?\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d02f871-56a1-4118-949a-2322177b58e5",
   "metadata": {},
   "source": [
    "## Completion Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "51569399-1d66-43b3-a707-71220d19cd58",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llms.vllm import VllmServer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4fc8e4c-0207-4f23-a5e9-e0851186e9f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Client for a vLLM API server; assumes one is already listening on localhost:8000.\n",
    "llm = VllmServer(\n",
    "    api_url=\"http://localhost:8000/generate\",\n",
    "    temperature=0,\n",
    "    max_new_tokens=100,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6b95731-0361-44fe-9219-bed582cb9cbe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[CompletionResponse(text='what is a black hole ?\\n\\nA black hole is a region in space where the gravitational pull is so strong that nothing, not even light, can escape from it. It is formed when a massive star collapses under its own gravity after it has exhausted its nuclear fuel. The boundary around the black hole, called the event horizon, marks the point of no return, beyond which anything that gets too close will be pulled in and cannot escape.', additional_kwargs={}, raw=None, delta=None)]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "llm.complete(\"what is a black hole ?\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "898eadce-8349-417e-9cdc-3299f69894d6",
   "metadata": {},
   "source": [
    "## Streaming Response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82b43656-2144-475b-bde3-fe830c9489e7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CompletionResponse(text='what is a black hole?\\n\\nA black hole is a region in space where the gravitational pull is so strong that nothing, not even light, can escape from it. It is formed when a massive star collapses under its own gravity after it has exhausted its nuclear fuel. The boundary around the black hole, called the event horizon, marks the point of no return, beyond which anything that gets too close will be pulled in and cannot escape.', additional_kwargs={}, raw=None, delta=None)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(llm.stream_complete(\"what is a black hole\"))[-1]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c078acbc-f2a8-4b3b-8d3c-89858adf431e",
   "metadata": {},
   "source": [
    "# API Response\n",
    "To set up the API server, follow the guide at https://docs.vllm.ai/en/latest/serving/distributed_serving.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d867ee6a-2a1b-4152-8616-1d2b507d9fab",
   "metadata": {},
   "source": [
    "## Completion Response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d25d595-e911-43f5-9538-865140f42234",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.llms.vllm import VllmServer\n",
    "from llama_index.llms import ChatMessage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4afbd046-bec9-497a-838e-b06bdbfa63db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Client for a vLLM API server; assumes one is already listening on localhost:8000.\n",
    "llm = VllmServer(\n",
    "    api_url=\"http://localhost:8000/generate\",\n",
    "    temperature=0,\n",
    "    max_new_tokens=100,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e065bcc7-f806-4bde-9dbf-4fe7f71d24b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CompletionResponse(text='what is a black hole ?\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed when a massive star dies and its core collapses under the intense pressure of its own gravity. The core becomes infinitely dense and small, known as a singularity, and anything that gets too close to it is pulled in by the incredibly strong gravitational force. Black holes can have different masses, from small ones that are barely noticeable to', additional_kwargs={}, raw=None, delta=None)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "llm.complete(\"what is a black hole ?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "babda560-cc6a-427b-99bb-0ee451022752",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='user: hello\\nassistant: 안녕하세요! 어떻게 도움이 되겠습니까?', additional_kwargs={}), raw=None, delta=None, additional_kwargs={})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# `role` identifies the speaker of each chat message.\n",
    "message = [ChatMessage(content=\"hello\", role=\"user\")]\n",
    "llm.chat(message)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6236b3e7-090e-45e0-807c-c3f8fab365d0",
   "metadata": {},
   "source": [
    "## Streaming Response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "849ebcb5-7ff3-4686-b3f4-51ea73ec732d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CompletionResponse(text='what is a black hole?\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed when a massive star dies and its core collapses under the intense pressure of its own gravity. They are invisible, but their presence can be detected by the effects of their gravity on nearby matter, such as stars and gas. Black holes are thought to be the most dense objects in the universe, with masses up to several times that of the', additional_kwargs={}, raw=None, delta=None)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Exhaust the stream, then display only the final item it yielded.\n",
    "[chunk for chunk in llm.stream_complete(\"what is a black hole\")][-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4513f794-d0cc-451b-bc79-538d76a8e058",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='user: what is a black hole\\nassistant: \\n\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed when a massive star dies and its core collapses under the intense pressure of its own gravity. They are invisible, but their presence can be detected by the effects of their gravity on nearby matter, such as stars and gas. Black holes are thought to be the most dense objects in the universe, with masses up to several times that of the', additional_kwargs={}), raw=None, delta=None, additional_kwargs={})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Set the speaker with `role`; `author` is not a ChatMessage field.\n",
    "message = [ChatMessage(content=\"what is a black hole\", role=\"user\")]\n",
    "# Drain the stream and display only the last response it yielded.\n",
    "[chunk for chunk in llm.stream_chat(message)][-1]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71b8e03f-fcf5-4650-b701-585ca68f4bb8",
   "metadata": {},
   "source": [
    "## Async Response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "200af3f1-f82e-45dd-86b7-cd189b113048",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CompletionResponse(text='What is a black hole?\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed from the remnants of massive stars that have collapsed onto themselves. They are invisible, but their presence can be detected by the effects of their gravity on nearby matter, such as stars and gas. Black holes come in different sizes, with the smallest being only a few miles across, and the largest being billions of times larger than our Sun', additional_kwargs={}, raw=None, delta=None)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top-level `await` relies on the notebook kernel's autoawait support.\n",
    "await llm.acomplete(\"What is a black hole\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7489cf16-28d0-465b-b684-07a72d4576d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='user: what is a black hole\\nassistant: \\n\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed when a massive star dies and its core collapses under the intense pressure of its own gravity. They are invisible, but their presence can be detected by the effects of their gravity on nearby matter, such as stars and gas. Black holes are thought to be the most dense objects in the universe, with masses up to several times that of the', additional_kwargs={}), raw=None, delta=None, additional_kwargs={})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reuses `message` as defined in the streaming-chat cell earlier in this notebook.\n",
    "await llm.achat(message)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5577120f-0dc0-494c-bfe9-008b145717c9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CompletionResponse(text='what is a black hole?\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed when a massive star dies and its core collapses under the intense pressure of its own gravity. They are invisible, but their presence can be detected by the effects of their gravity on nearby matter, such as stars and gas. Black holes are thought to be the most dense objects in the universe, with masses up to several times that of the', additional_kwargs={}, raw=None, delta=None)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the async stream and display only its last item.\n",
    "[chunk async for chunk in await llm.astream_complete(\"what is a black hole\")][-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78003ef4-ebb2-4d0a-b959-c4dee3714d30",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='user: what is a black hole\\nassistant: \\n\\nA black hole is a region of space-time where gravity is so strong that nothing, not even light, can escape. Black holes are formed when a massive star dies and its core collapses under the intense pressure of its own gravity. They are invisible, but their presence can be detected by the effects of their gravity on nearby matter, such as stars and gas. Black holes are thought to be the most dense objects in the universe, with masses up to several times that of the', additional_kwargs={}), raw=None, delta=None, additional_kwargs={})"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): the awaited result is consumed with plain (sync) iteration,\n",
    "# unlike the `async for` used with astream_complete -- confirm this is intended.\n",
    "list(await llm.astream_chat(message))[-1]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
